Loading libraries
library(RColorBrewer)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.0
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(readr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(tidyverse)
library(stringr)
library(dplyr)
library(formattable)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:formattable':
##
## style
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
pinky <- read_csv("pinkyblastp_top50.txt",
col_names = c("qseqid", "sseqid", "evalue", "staxids", "sscinames", "sskingdoms", "stitle"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 8025 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): qseqid, sseqid, staxids, sscinames, sskingdoms, stitle
## dbl (1): evalue
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pinky
## # A tibble: 8,025 × 7
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 NODE_795_length_49550_cov_5.… ref|Y… 0 1932881 Pacman… Viruses Hypot…
## 2 NODE_795_length_49550_cov_5.… gb|QY… 1.28e-180 2862371 Pacman… Viruses polyp…
## 3 NODE_795_length_49550_cov_5.… gb|QJ… 3.69e- 98 1477405 Fausto… Viruses polyp…
## 4 NODE_795_length_49550_cov_5.… emb|S… 3.87e- 98 1477405 Fausto… Viruses Hypot…
## 5 NODE_795_length_49550_cov_5.… gb|QJ… 4.14e- 98 1477405 Fausto… Viruses hypot…
## 6 NODE_795_length_49550_cov_5.… gb|QJ… 4.86e- 98 1477405 Fausto… Viruses hypot…
## 7 NODE_795_length_49550_cov_5.… gb|QJ… 7.88e- 97 1477405 Fausto… Viruses polyp…
## 8 NODE_795_length_49550_cov_5.… gb|QJ… 8.18e- 97 1477405 Fausto… Viruses polyp…
## 9 NODE_795_length_49550_cov_5.… emb|S… 1.22e- 94 1973452 Fausto… Viruses 220 k…
## 10 NODE_795_length_49550_cov_5.… gb|AM… 4.27e- 93 1477405 Fausto… Viruses hypot…
## # … with 8,015 more rows, and abbreviated variable names ¹​sscinames,
## # ²​sskingdoms
Filtering e-values with hits lower than 1e-10
pinky_evalue <- pinky %>%
filter(as.numeric(evalue) <= 1e-10)
pinky_evalue
## # A tibble: 3,923 × 7
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 NODE_795_length_49550_cov_5.… ref|Y… 0 1932881 Pacman… Viruses Hypot…
## 2 NODE_795_length_49550_cov_5.… gb|QY… 1.28e-180 2862371 Pacman… Viruses polyp…
## 3 NODE_795_length_49550_cov_5.… gb|QJ… 3.69e- 98 1477405 Fausto… Viruses polyp…
## 4 NODE_795_length_49550_cov_5.… emb|S… 3.87e- 98 1477405 Fausto… Viruses Hypot…
## 5 NODE_795_length_49550_cov_5.… gb|QJ… 4.14e- 98 1477405 Fausto… Viruses hypot…
## 6 NODE_795_length_49550_cov_5.… gb|QJ… 4.86e- 98 1477405 Fausto… Viruses hypot…
## 7 NODE_795_length_49550_cov_5.… gb|QJ… 7.88e- 97 1477405 Fausto… Viruses polyp…
## 8 NODE_795_length_49550_cov_5.… gb|QJ… 8.18e- 97 1477405 Fausto… Viruses polyp…
## 9 NODE_795_length_49550_cov_5.… emb|S… 1.22e- 94 1973452 Fausto… Viruses 220 k…
## 10 NODE_795_length_49550_cov_5.… gb|AM… 4.27e- 93 1477405 Fausto… Viruses hypot…
## # … with 3,913 more rows, and abbreviated variable names ¹​sscinames,
## # ²​sskingdoms
pinky_duplicates <- pinky_evalue %>%
group_by(qseqid) %>%
distinct(sseqid, .keep_all = TRUE)
print(pinky_duplicates)
## # A tibble: 3,689 × 7
## # Groups: qseqid [132]
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 NODE_795_length_49550_cov_5.… ref|Y… 0 1932881 Pacman… Viruses Hypot…
## 2 NODE_795_length_49550_cov_5.… gb|QY… 1.28e-180 2862371 Pacman… Viruses polyp…
## 3 NODE_795_length_49550_cov_5.… gb|QJ… 3.69e- 98 1477405 Fausto… Viruses polyp…
## 4 NODE_795_length_49550_cov_5.… emb|S… 3.87e- 98 1477405 Fausto… Viruses Hypot…
## 5 NODE_795_length_49550_cov_5.… gb|QJ… 4.14e- 98 1477405 Fausto… Viruses hypot…
## 6 NODE_795_length_49550_cov_5.… gb|QJ… 4.86e- 98 1477405 Fausto… Viruses hypot…
## 7 NODE_795_length_49550_cov_5.… gb|QJ… 7.88e- 97 1477405 Fausto… Viruses polyp…
## 8 NODE_795_length_49550_cov_5.… gb|QJ… 8.18e- 97 1477405 Fausto… Viruses polyp…
## 9 NODE_795_length_49550_cov_5.… emb|S… 1.22e- 94 1973452 Fausto… Viruses 220 k…
## 10 NODE_795_length_49550_cov_5.… gb|AM… 4.27e- 93 1477405 Fausto… Viruses hypot…
## # … with 3,679 more rows, and abbreviated variable names ¹​sscinames,
## # ²​sskingdoms
Renaming qseqids the product names produced on Prokka and hallmark
proteins
library(dplyr)
rename_values <- c(
"NODE_2150_length_28922_cov_4.964458_8" = "A32",
"NODE_5244_length_16314_cov_5.483917_1" = "D5",
"NODE_22952_length_5851_cov_5.322809_7" = "D5",
"NODE_795_length_49550_cov_5.527831_39" = "mRNAc",
"NODE_2253_length_28109_cov_5.502032_7" = "PolB",
"NODE_795_length_49550_cov_5.527831_38" = "RNAPL",
"NODE_1151_length_41258_cov_5.297770_55" = "RNAPS",
"NODE_4708_length_17479_cov_5.389635_11" = "RNR",
"NODE_936_length_45786_cov_4.756839_10" = "SFII",
"NODE_2150_length_28922_cov_4.964458_11" = "VLTF3",
"NODE_795_length_49550_cov_5.527831_2" = "Polyprotein pp62",
"NODE_795_length_49550_cov_5.527831_20" = "DNA Ligase",
"NODE_795_length_49550_cov_5.527831_49" = "mRNA-decapping protein g5R",
"NODE_936_length_45786_cov_4.756839_8" = "putative AP endonuclease",
"NODE_936_length_45786_cov_4.756839_17" = "Thymidylate synthase",
"NODE_1151_length_41258_cov_5.297770_33" = "Deoxyuridine 5'-triphosphate nucleotidohydrolase"
)
# Replacing values in the 'qseqid' column in 'data_evalue' dataframe
pinky_duplicates$qseqid <- ifelse(pinky_duplicates$qseqid %in% names(rename_values), rename_values[pinky_duplicates$qseqid], pinky_duplicates$qseqid)
pinky_kingdom <- pinky_evalue %>%
count(sskingdoms)
pinky_kingdom
## # A tibble: 6 × 2
## sskingdoms n
## <chr> <int>
## 1 Archaea 161
## 2 Archaea;Bacteria 1
## 3 Bacteria 948
## 4 Eukaryota 639
## 5 N/A 4
## 6 Viruses 2170
piechart_pinky <- ggplot(pinky_kingdom, aes(x="", y=n, fill=sskingdoms)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
scale_fill_brewer(palette = "PiYG") +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
piechart_pinky

pinky_eukaryotes <- pinky_duplicates %>%
filter(sskingdoms == "Eukaryota") %>%
group_by(qseqid, sseqid, staxids) %>%
slice_min(order_by = evalue) %>%
ungroup() %>%
mutate(genus = word(sscinames, 1))
pinky_eukaryotes
## # A tibble: 618 × 8
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 A32 dbj|G… 1.64e- 49 1093978 Elysia… Eukary… BA71V… Elys…
## 2 D5 dbj|G… 1.18e-165 1093978 Elysia… Eukary… BA71V… Elys…
## 3 DNA Ligase dbj|G… 1.17e- 42 1093978 Elysia… Eukary… DNA l… Elys…
## 4 DNA Ligase emb|C… 1.14e- 22 27381 Funnel… Eukary… 9786_… Funn…
## 5 DNA Ligase gb|KA… 2.33e- 25 82926 Globis… Eukary… hypot… Glob…
## 6 DNA Ligase gb|KA… 2.97e- 24 29920 Phytop… Eukary… hypot… Phyt…
## 7 DNA Ligase gb|KA… 3.4 e- 31 164328 Phytop… Eukary… DNA l… Phyt…
## 8 DNA Ligase gb|KA… 4.01e- 24 164328 Phytop… Eukary… DNA l… Phyt…
## 9 Deoxyuridine 5'-tripho… gb|EM… 2.98e- 24 885315 Entamo… Eukary… deoxy… Enta…
## 10 Deoxyuridine 5'-tripho… gb|KA… 2.28e- 25 117179 Tillet… Eukary… hypot… Till…
## # … with 608 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
pinky_egenus_counts <- pinky_eukaryotes %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 1) # Only include those with more than one count
print(pinky_egenus_counts)
## # A tibble: 98 × 3
## # Groups: qseqid, genus [98]
## qseqid genus count
## <chr> <chr> <int>
## 1 DNA Ligase Phytophthora 3
## 2 Deoxyuridine 5'-triphosphate nucleotidohydrolase Tilletia 5
## 3 NODE_1151_length_41258_cov_5.297770_24 Macrostomum 12
## 4 NODE_1507_length_35821_cov_5.845160_24 Entrophospora 2
## 5 NODE_1507_length_35821_cov_5.845160_24 Rhizophagus 45
## 6 NODE_15262_length_7722_cov_5.326203_6 Perkinsus 3
## 7 NODE_2150_length_28922_cov_4.964458_23 Phytophthora 5
## 8 NODE_2253_length_28109_cov_5.502032_1 Elysia 2
## 9 NODE_2253_length_28109_cov_5.502032_10 Tetrabaena 2
## 10 NODE_2253_length_28109_cov_5.502032_3 Trametes 2
## # … with 88 more rows
distinct_colors <- c("skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue", "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin")
pinkyvirus_eukaryotes_ggplot <- ggplot(pinky_egenus_counts, aes(x = qseqid, y = count, fill = genus)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = distinct_colors) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
pinkyvirus_eukaryotes_ggplot

ggplotly(pinkyvirus_eukaryotes_ggplot)
eukaryotes_pinkyggplot <- ggplot(pinky_egenus_counts, aes(x = genus, y = count)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
theme_minimal() +
theme(text = element_text(size = 12), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
eukaryotes_pinkyggplot

Rhizophagus - Homology due to Horizontal Gene Transfer?
rhizophagus_pinky <- pinky_eukaryotes %>%
filter(genus == "Rhizophagus")
rhizophagus_pinky
## # A tibble: 92 × 8
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 NODE_1507_length_35821_… dbj|G… 2.34e-53 747089 Rhizop… Eukary… poxvi… Rhiz…
## 2 NODE_1507_length_35821_… dbj|G… 1.73e-54 747089 Rhizop… Eukary… poxvi… Rhiz…
## 3 NODE_1507_length_35821_… dbj|G… 6.29e-52 747089 Rhizop… Eukary… poxvi… Rhiz…
## 4 NODE_1507_length_35821_… dbj|G… 1.48e-53 747089 Rhizop… Eukary… poxvi… Rhiz…
## 5 NODE_1507_length_35821_… dbj|G… 4.87e-51 747089 Rhizop… Eukary… poxvi… Rhiz…
## 6 NODE_1507_length_35821_… dbj|G… 2 e-50 747089 Rhizop… Eukary… poxvi… Rhiz…
## 7 NODE_1507_length_35821_… dbj|G… 8.67e-51 747089 Rhizop… Eukary… poxvi… Rhiz…
## 8 NODE_1507_length_35821_… dbj|G… 8.78e-51 747089 Rhizop… Eukary… poxvi… Rhiz…
## 9 NODE_1507_length_35821_… dbj|G… 7.45e-52 747089 Rhizop… Eukary… poxvi… Rhiz…
## 10 NODE_1507_length_35821_… dbj|G… 6.05e-52 747089 Rhizop… Eukary… poxvi… Rhiz…
## # … with 82 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
rhizophagus_pinky$sseqid
## [1] "dbj|GBC12205.2|" "dbj|GBC13564.1|" "dbj|GBC21622.1|"
## [4] "dbj|GBC31224.2|" "dbj|GBC35452.2|" "dbj|GBC39224.2|"
## [7] "dbj|GBC47585.1|" "dbj|GET50588.1|" "dbj|GET50592.1|"
## [10] "dbj|GET50593.1|" "dbj|GET51853.1|" "dbj|GET51855.1|"
## [13] "dbj|GET51856.1|" "dbj|GET51873.1|" "dbj|GET51875.1|"
## [16] "dbj|GET51876.1|" "dbj|GET52808.1|" "dbj|GET54086.1|"
## [19] "dbj|GET54089.1|" "dbj|GET54174.1|" "dbj|GET56822.1|"
## [22] "dbj|GET56825.1|" "dbj|GET56826.1|" "dbj|GET57267.1|"
## [25] "dbj|GET57268.1|" "dbj|GET57269.1|" "dbj|GET57381.1|"
## [28] "dbj|GET57383.1|" "dbj|GET63053.1|" "dbj|GET66458.1|"
## [31] "dbj|GET66459.1|" "dbj|GET67080.1|" "dbj|GET67081.1|"
## [34] "dbj|GET67083.1|" "dbj|GET67084.1|" "dbj|GET67086.1|"
## [37] "dbj|GET67348.1|" "dbj|GET67349.1|" "emb|CAB4403144.1|"
## [40] "emb|CAB4418995.1|" "emb|CAB4443500.1|" "emb|CAB4446634.1|"
## [43] "emb|CAB4479734.1|" "gb|EXX63233.1|" "gb|PKY27577.1|"
## [46] "dbj|GBC36622.2|" "emb|CAB5185030.1|" "gb|EXX73414.1|"
## [49] "gb|PKC09441.1|" "gb|PKY22961.1|" "ref|XP_025173077.1|"
## [52] "emb|CAB5357217.1|" "emb|CAG8693468.1|" "emb|CAG8734086.1|"
## [55] "gb|UZO10893.1|" "ref|XP_025164841.1|" "dbj|GBB89267.1|"
## [58] "dbj|GES93775.1|" "dbj|GBB95954.1|" "dbj|GBC22393.2|"
## [61] "dbj|GES89815.1|" "dbj|GES94728.1|" "dbj|GES96473.1|"
## [64] "dbj|GES96599.1|" "emb|CAB4402555.1|" "emb|CAB4403248.1|"
## [67] "emb|CAB4474595.1|" "emb|CAB4476463.1|" "emb|CAB4480535.1|"
## [70] "emb|CAB4492557.1|" "emb|CAB5186435.1|" "emb|CAB5205225.1|"
## [73] "emb|CAB5208415.1|" "emb|CAG8744073.1|" "gb|PKB96922.1|"
## [76] "gb|PKC08263.1|" "gb|PKC54175.1|" "gb|PKC57870.1|"
## [79] "gb|PKK58991.1|" "gb|PKK60440.1|" "gb|PKK63436.1|"
## [82] "gb|RGB22550.1|" "ref|XP_025181784.1|" "emb|CAB5212864.1|"
## [85] "emb|CAB5383446.1|" "gb|EXX55117.1|" "gb|UZO20410.1|"
## [88] "ref|XP_025171547.1|" "emb|CAB4481493.1|" "gb|PKC17413.1|"
## [91] "gb|PKC65499.1|" "gb|PKY14747.1|"
# Count the number of times each genus appears for a specific protein sequence
rhizophagus_counts_pinky <- rhizophagus_pinky %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 1) # Only include those with more than one count
print(rhizophagus_counts_pinky)
## # A tibble: 8 × 3
## # Groups: qseqid, genus [8]
## qseqid genus count
## <chr> <chr> <int>
## 1 NODE_1507_length_35821_cov_5.845160_24 Rhizophagus 45
## 2 NODE_4293_length_18587_cov_5.277736_15 Rhizophagus 6
## 3 NODE_4708_length_17479_cov_5.389635_18 Rhizophagus 5
## 4 NODE_936_length_45786_cov_4.756839_31 Rhizophagus 2
## 5 PolB Rhizophagus 25
## 6 Polyprotein pp62 Rhizophagus 2
## 7 mRNA-decapping protein g5R Rhizophagus 3
## 8 mRNAc Rhizophagus 4
dc <- c("#DE77AE")
rhizophagus_ggplot <- ggplot(rhizophagus_counts_pinky, aes(x = qseqid, y = count, fill = genus)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = dc) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
rhizophagus_ggplot

library(rentrez)
# Function to retrieve BioSample accession for a given protein accession
get_biosample_accession <- function(rhizophagusPinky_accessions) {
entrez_query <- paste0(rhizophagusPinky_accessions, "[ACCN]")
entrez_result <- entrez_fetch(db = "protein", id = entrez_query, rettype = "gb", retmode = "text")
biosample_accession <- regmatches(entrez_result, regexpr("BioSample: (\\w+)", entrez_result))
return(biosample_accession)
}
# Example protein accession numbers
rhizophagusPinky_accessions <- c(
"dbj|GBC12205.2|", "dbj|GBC13564.1|", "dbj|GBC21622.1|", "dbj|GBC31224.2|", "dbj|GBC35452.2|",
"dbj|GBC39224.2|", "dbj|GBC47585.1|", "dbj|GET50588.1|", "dbj|GET50592.1|", "dbj|GET50593.1|",
"dbj|GET51853.1|", "dbj|GET51855.1|", "dbj|GET51856.1|", "dbj|GET51873.1|", "dbj|GET51875.1|",
"dbj|GET51876.1|", "dbj|GET52808.1|", "dbj|GET54086.1|", "dbj|GET54089.1|", "dbj|GET54174.1|",
"dbj|GET56822.1|", "dbj|GET56825.1|", "dbj|GET56826.1|", "dbj|GET57267.1|", "dbj|GET57268.1|",
"dbj|GET57269.1|", "dbj|GET57381.1|", "dbj|GET57383.1|", "dbj|GET63053.1|", "dbj|GET66458.1|",
"dbj|GET66459.1|", "dbj|GET67080.1|", "dbj|GET67081.1|", "dbj|GET67083.1|", "dbj|GET67084.1|",
"dbj|GET67086.1|", "dbj|GET67348.1|", "dbj|GET67349.1|", "emb|CAB4403144.1|", "emb|CAB4418995.1|",
"emb|CAB4443500.1|", "emb|CAB4446634.1|", "emb|CAB4479734.1|", "gb|EXX63233.1|", "gb|PKY27577.1|",
"dbj|GBC36622.2|", "emb|CAB5185030.1|", "gb|EXX73414.1|", "gb|PKC09441.1|", "gb|PKY22961.1|",
"ref|XP_025173077.1|", "emb|CAB5357217.1|", "emb|CAG8693468.1|", "emb|CAG8734086.1|", "gb|UZO10893.1|",
"ref|XP_025164841.1|", "dbj|GBB89267.1|", "dbj|GES93775.1|", "dbj|GBB95954.1|", "dbj|GBC22393.2|",
"dbj|GES89815.1|", "dbj|GES94728.1|", "dbj|GES96473.1|", "dbj|GES96599.1|", "emb|CAB4402555.1|",
"emb|CAB4403248.1|", "emb|CAB4474595.1|", "emb|CAB4476463.1|", "emb|CAB4480535.1|", "emb|CAB4492557.1|",
"emb|CAB5186435.1|", "emb|CAB5205225.1|", "emb|CAB5208415.1|", "emb|CAG8744073.1|", "gb|PKB96922.1|",
"gb|PKC08263.1|", "gb|PKC54175.1|", "gb|PKC57870.1|", "gb|PKK58991.1|", "gb|PKK60440.1|", "gb|PKK63436.1|",
"gb|RGB22550.1|", "ref|XP_025181784.1|", "emb|CAB5212864.1|", "emb|CAB5383446.1|", "gb|EXX55117.1|",
"gb|UZO20410.1|", "ref|XP_025171547.1|", "emb|CAB4481493.1|", "gb|PKC17413.1|", "gb|PKC65499.1|", "gb|PKY14747.1|"
)
# Retrieve the BioSample accession for the given protein accessions
biosample_accessions <- sapply(rhizophagusPinky_accessions, get_biosample_accession)
# Print the result
print(biosample_accessions)
## dbj|GBC12205.2| dbj|GBC13564.1| dbj|GBC21622.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GBC31224.2| dbj|GBC35452.2| dbj|GBC39224.2|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GBC47585.1| dbj|GET50588.1| dbj|GET50592.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET50593.1| dbj|GET51853.1| dbj|GET51855.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET51856.1| dbj|GET51873.1| dbj|GET51875.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET51876.1| dbj|GET52808.1| dbj|GET54086.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET54089.1| dbj|GET54174.1| dbj|GET56822.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET56825.1| dbj|GET56826.1| dbj|GET57267.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET57268.1| dbj|GET57269.1| dbj|GET57381.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET57383.1| dbj|GET63053.1| dbj|GET66458.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET66459.1| dbj|GET67080.1| dbj|GET67081.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET67083.1| dbj|GET67084.1| dbj|GET67086.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945"
## dbj|GET67348.1| dbj|GET67349.1| emb|CAB4403144.1|
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMEA5841291"
## emb|CAB4418995.1| emb|CAB4443500.1| emb|CAB4446634.1|
## "BioSample: SAMEA5841291" "BioSample: SAMEA5841291" "BioSample: SAMEA5841291"
## emb|CAB4479734.1| gb|EXX63233.1| gb|PKY27577.1|
## "BioSample: SAMEA5841244" "BioSample: SAMN02422737" "BioSample: SAMN04195446"
## dbj|GBC36622.2| emb|CAB5185030.1| gb|EXX73414.1|
## "BioSample: SAMD00053945" "BioSample: SAMEA5841246" "BioSample: SAMN02422737"
## gb|PKC09441.1| gb|PKY22961.1| ref|XP_025173077.1|
## "BioSample: SAMN04195397" "BioSample: SAMN04195446" "BioSample: SAMN02744054"
## emb|CAB5357217.1| emb|CAG8693468.1| emb|CAG8734086.1|
## "BioSample: SAMEA5841255" "BioSample: SAMEA8911290" "BioSample: SAMEA8911290"
## gb|UZO10893.1| ref|XP_025164841.1| dbj|GBB89267.1|
## "BioSample: SAMN31081226" "BioSample: SAMN02744054" "BioSample: SAMD00098044"
## dbj|GES93775.1| dbj|GBB95954.1| dbj|GBC22393.2|
## "BioSample: SAMD00190756" "BioSample: SAMD00098044" "BioSample: SAMD00053945"
## dbj|GES89815.1| dbj|GES94728.1| dbj|GES96473.1|
## "BioSample: SAMD00190756" "BioSample: SAMD00190756" "BioSample: SAMD00190756"
## dbj|GES96599.1| emb|CAB4402555.1| emb|CAB4403248.1|
## "BioSample: SAMD00190756" "BioSample: SAMEA5841291" "BioSample: SAMEA5841291"
## emb|CAB4474595.1| emb|CAB4476463.1| emb|CAB4480535.1|
## "BioSample: SAMEA5841244" "BioSample: SAMEA5841244" "BioSample: SAMEA5841244"
## emb|CAB4492557.1| emb|CAB5186435.1| emb|CAB5205225.1|
## "BioSample: SAMEA5841244" "BioSample: SAMEA5841246" "BioSample: SAMEA5841246"
## emb|CAB5208415.1| emb|CAG8744073.1| gb|PKB96922.1|
## "BioSample: SAMEA5841246" "BioSample: SAMEA8911290" "BioSample: SAMN04195397"
## gb|PKC08263.1| gb|PKC54175.1| gb|PKC57870.1|
## "BioSample: SAMN04195397" "BioSample: SAMN04195188" "BioSample: SAMN04195188"
## gb|PKK58991.1| gb|PKK60440.1| gb|PKK63436.1|
## "BioSample: SAMN04195450" "BioSample: SAMN04195450" "BioSample: SAMN04195450"
## gb|RGB22550.1| ref|XP_025181784.1| emb|CAB5212864.1|
## "BioSample: SAMN08364867" "BioSample: SAMN02744054" "BioSample: SAMEA5841246"
## emb|CAB5383446.1| gb|EXX55117.1| gb|UZO20410.1|
## "BioSample: SAMEA5841255" "BioSample: SAMN02422737" "BioSample: SAMN31081226"
## ref|XP_025171547.1| emb|CAB4481493.1| gb|PKC17413.1|
## "BioSample: SAMN02744054" "BioSample: SAMEA5841244" "BioSample: SAMN04195397"
## gb|PKC65499.1| gb|PKY14747.1|
## "BioSample: SAMN04195188" "BioSample: SAMN04195446"
# Retrieve the BioSample accessions for the sseqid column
rhizophagus_pinky$biosample_accession <- sapply(rhizophagus_pinky$sseqid, get_biosample_accession)
# Print the updated data
print(rhizophagus_pinky)
## # A tibble: 92 × 9
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus biosa…³
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NODE_1507_lengt… dbj|G… 2.34e-53 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 2 NODE_1507_lengt… dbj|G… 1.73e-54 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 3 NODE_1507_lengt… dbj|G… 6.29e-52 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 4 NODE_1507_lengt… dbj|G… 1.48e-53 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 5 NODE_1507_lengt… dbj|G… 4.87e-51 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 6 NODE_1507_lengt… dbj|G… 2 e-50 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 7 NODE_1507_lengt… dbj|G… 8.67e-51 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 8 NODE_1507_lengt… dbj|G… 8.78e-51 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 9 NODE_1507_lengt… dbj|G… 7.45e-52 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 10 NODE_1507_lengt… dbj|G… 6.05e-52 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## # … with 82 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## # ³​biosample_accession
# Removing repeated Biosample IDs.
rhizophagus_biosample <- rhizophagus_pinky %>%
group_by(qseqid) %>%
distinct(biosample_accession, .keep_all = TRUE)
rhizophagus_biosample
## # A tibble: 38 × 9
## # Groups: qseqid [8]
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus biosa…³
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NODE_1507_lengt… dbj|G… 2.34e-53 747089 Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 2 NODE_1507_lengt… emb|C… 4.43e-54 588596 Rhizop… Eukary… unnam… Rhiz… BioSam…
## 3 NODE_1507_lengt… emb|C… 1.44e-54 588596 Rhizop… Eukary… unnam… Rhiz… BioSam…
## 4 NODE_1507_lengt… gb|EX… 1.8 e-50 747089… Rhizop… Eukary… hypot… Rhiz… BioSam…
## 5 NODE_1507_lengt… gb|PK… 1.19e-49 588596 Rhizop… Eukary… hypot… Rhiz… BioSam…
## 6 NODE_4293_lengt… dbj|G… 2.31e-22 747089 Rhizop… Eukary… hypot… Rhiz… BioSam…
## 7 NODE_4293_lengt… emb|C… 4.38e-38 588596 Rhizop… Eukary… unnam… Rhiz… BioSam…
## 8 NODE_4293_lengt… gb|EX… 7.89e-30 1432141 Rhizop… Eukary… hypot… Rhiz… BioSam…
## 9 NODE_4293_lengt… gb|PK… 7.6 e-38 588596 Rhizop… Eukary… hypot… Rhiz… BioSam…
## 10 NODE_4293_lengt… gb|PK… 1.12e-37 588596 Rhizop… Eukary… hypot… Rhiz… BioSam…
## # … with 28 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## # ³​biosample_accession
# Count the number of times each genus appears for a specific protein sequence
rhizophagus_biosample_count <- rhizophagus_biosample %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 1) # Only include those with more than one count
print(rhizophagus_biosample_count)
## # A tibble: 8 × 3
## # Groups: qseqid, genus [8]
## qseqid genus count
## <chr> <chr> <int>
## 1 NODE_1507_length_35821_cov_5.845160_24 Rhizophagus 5
## 2 NODE_4293_length_18587_cov_5.277736_15 Rhizophagus 6
## 3 NODE_4708_length_17479_cov_5.389635_18 Rhizophagus 4
## 4 NODE_936_length_45786_cov_4.756839_31 Rhizophagus 2
## 5 PolB Rhizophagus 12
## 6 Polyprotein pp62 Rhizophagus 2
## 7 mRNA-decapping protein g5R Rhizophagus 3
## 8 mRNAc Rhizophagus 4
rhizophagus_biosample_ggplot <- ggplot(rhizophagus_biosample_count, aes(x = qseqid, y = count, fill = genus)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = dc) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
rhizophagus_biosample_ggplot

Phytophthora - Homology due to Horizontal Gene Transfer?
phytophthora_pinky <- pinky_eukaryotes %>%
filter(genus == "Phytophthora")
phytophthora_pinky
## # A tibble: 57 × 8
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 DNA Ligase gb|KA… 2.97e- 24 29920 Phytop… Eukary… hypot… Phyt…
## 2 DNA Ligase gb|KA… 3.4 e- 31 164328 Phytop… Eukary… DNA l… Phyt…
## 3 DNA Ligase gb|KA… 4.01e- 24 164328 Phytop… Eukary… DNA l… Phyt…
## 4 NODE_10695_length_9982… gb|KA… 1.5 e- 11 109152 Phytop… Eukary… hypot… Phyt…
## 5 NODE_2150_length_28922… gb|ET… 4.73e-116 4792;1… Phytop… Eukary… hypot… Phyt…
## 6 NODE_2150_length_28922… gb|ET… 1.12e-111 1317066 Phytop… Eukary… hypot… Phyt…
## 7 NODE_2150_length_28922… gb|ET… 2.82e-113 1317064 Phytop… Eukary… hypot… Phyt…
## 8 NODE_2150_length_28922… gb|KU… 5.03e-115 4790 Phytop… Eukary… Major… Phyt…
## 9 NODE_2150_length_28922… ref|X… 2.31e-114 761204 Phytop… Eukary… hypot… Phyt…
## 10 NODE_4293_length_18587… gb|KA… 1.38e- 27 221518 Phytop… Eukary… hypot… Phyt…
## # … with 47 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
phytophthora_pinkycounts <- phytophthora_pinky %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 1) # Only include those with more than one count
print(phytophthora_pinkycounts)
## # A tibble: 10 × 3
## # Groups: qseqid, genus [10]
## qseqid genus count
## <chr> <chr> <int>
## 1 DNA Ligase Phytophthora 3
## 2 NODE_2150_length_28922_cov_4.964458_23 Phytophthora 5
## 3 NODE_795_length_49550_cov_5.527831_47 Phytophthora 2
## 4 NODE_936_length_45786_cov_4.756839_11 Phytophthora 5
## 5 NODE_936_length_45786_cov_4.756839_16 Phytophthora 26
## 6 NODE_936_length_45786_cov_4.756839_28 Phytophthora 2
## 7 NODE_936_length_45786_cov_4.756839_44 Phytophthora 3
## 8 RNAPL Phytophthora 3
## 9 RNAPS Phytophthora 2
## 10 VLTF3 Phytophthora 2
pc <- c("#7FBC41")
phytophthora_pinky_ggplot <- ggplot(phytophthora_pinkycounts, aes(x = qseqid, y = count, fill = genus)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = pc) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
phytophthora_pinky_ggplot

phytophthora_pinky$sseqid
## [1] "gb|KAG6947535.1|" "gb|KAH7468958.1|" "gb|KAH7499174.1|"
## [4] "gb|KAG7394170.1|" "gb|ETI39689.1|" "gb|ETO68404.1|"
## [7] "gb|ETP37615.1|" "gb|KUG01557.1|" "ref|XP_008910449.1|"
## [10] "gb|KAG7375702.1|" "gb|KAF1783475.1|" "gb|ETL77478.1|"
## [13] "gb|ETP35848.1|" "gb|KAG1705391.1|" "gb|KAG7388034.1|"
## [16] "gb|KAG7391651.1|" "gb|KAH7470505.1|" "gb|OWZ13752.1|"
## [19] "dbj|GMF32380.1|" "gb|ETI47070.1|" "gb|ETK86998.1|"
## [22] "gb|ETL40412.1|" "gb|ETL93566.1|" "gb|KAE8887753.1|"
## [25] "gb|KAE8946961.1|" "gb|KAE8988664.1|" "gb|KAE8991089.1|"
## [28] "gb|KAF1781219.1|" "gb|KAF4045723.1|" "gb|KAG1686864.1|"
## [31] "gb|KAG1692341.1|" "gb|KAG2776868.1|" "gb|KAG3023315.1|"
## [34] "gb|KAG3114356.1|" "gb|KAG6604511.1|" "gb|KAG6972529.1|"
## [37] "gb|KAG7391747.1|" "gb|KAG7400365.1|" "gb|KUF97494.1|"
## [40] "gb|OWZ19253.1|" "gb|POM60565.1|" "ref|XP_002904741.1|"
## [43] "ref|XP_008905518.1|" "ref|XP_009520697.1|" "gb|KAG2967876.1|"
## [46] "gb|KAG3088486.1|" "gb|KAG6961386.1|" "gb|ETL86515.1|"
## [49] "gb|ETP37612.1|" "ref|XP_008910456.1|" "gb|ETL86504.1|"
## [52] "gb|KAF1785852.1|" "gb|KAG2759394.1|" "gb|ETO68411.1|"
## [55] "gb|ETP37619.1|" "gb|KAG2980116.1|" "gb|KAG3045077.1|"
library(rentrez)
# Function to retrieve BioSample accession for a given protein accession
get_biosample_accession_phytophthora <- function(phytophthoraPinky_accessions) {
if (startsWith(phytophthoraPinky_accessions, "ref")) {
return(phytophthoraPinky_accessions) # Return the protein accession directly
} else {
entrez_query <- paste0(phytophthoraPinky_accessions, "[ACCN]")
entrez_result <- entrez_fetch(db = "protein", id = entrez_query, rettype = "gb", retmode = "text")
biosample_accession_phytophthora <- regmatches(entrez_result, regexpr("BioSample: (\\w+)", entrez_result))
return(biosample_accession_phytophthora)
}}
# Example protein accession numbers
phytophthoraPinky_accessions <- c(
"gb|KAG6947535.1|", "gb|KAH7468958.1|", "gb|KAH7499174.1|", "gb|KAG7394170.1|", "gb|ETI39689.1|",
"gb|ETO68404.1|", "gb|ETP37615.1|", "gb|KUG01557.1|", "ref|XP_008910449.1|", "gb|KAG7375702.1|", "gb|KAF1783475.1|", "gb|ETL77478.1|", "gb|ETP35848.1|", "gb|KAG1705391.1|", "gb|KAG7388034.1|", "gb|KAG7391651.1|", "gb|KAH7470505.1|", "gb|OWZ13752.1|", "dbj|GMF32380.1|", "gb|ETI47070.1|", "gb|ETK86998.1|", "gb|ETL40412.1|", "gb|ETL93566.1|", "gb|KAE8887753.1|", "gb|KAE8946961.1|", "gb|KAE8988664.1|", "gb|KAE8991089.1|", "gb|KAF1781219.1|", "gb|KAF4045723.1|", "gb|KAG1686864.1|", "gb|KAG1692341.1|", "gb|KAG2776868.1|", "gb|KAG3023315.1|", "gb|KAG3114356.1|", "gb|KAG6604511.1|",
"gb|KAG6972529.1|", "gb|KAG7391747.1|", "gb|KAG7400365.1|", "gb|KUF97494.1|", "gb|OWZ19253.1|", "gb|POM60565.1|", "ref|XP_002904741.1|", "ref|XP_008905518.1|", "ref|XP_009520697.1|", "gb|KAG2967876.1|", "gb|KAG3088486.1|", "gb|KAG6961386.1|", "gb|ETL86515.1|", "gb|ETP37612.1|", "ref|XP_008910456.1|", "gb|ETL86504.1|", "gb|KAF1785852.1|", "gb|KAG2759394.1|", "gb|ETO68411.1|", "gb|ETP37619.1|", "gb|KAG2980116.1|", "gb|KAG3045077.1|"
)
# Retrieve the BioSample accession for the given protein accessions
biosample_accessions_phytophthora <- sapply(phytophthoraPinky_accessions, get_biosample_accession_phytophthora)
# Print the result
print(biosample_accessions_phytophthora)
## gb|KAG6947535.1| gb|KAH7468958.1| gb|KAH7499174.1|
## "BioSample: SAMN17151089" "BioSample: SAMN19730089" "BioSample: SAMN19730093"
## gb|KAG7394170.1| gb|ETI39689.1| gb|ETO68404.1|
## "BioSample: SAMN17922340" "BioSample: SAMN01816558" "BioSample: SAMN01816559"
## gb|ETP37615.1| gb|KUG01557.1| ref|XP_008910449.1|
## "BioSample: SAMN01816557" "BioSample: SAMN04017959" "ref|XP_008910449.1|"
## gb|KAG7375702.1| gb|KAF1783475.1| gb|ETL77478.1|
## "BioSample: SAMN17922341" "BioSample: SAMN04633454" "BioSample: SAMN02178349"
## gb|ETP35848.1| gb|KAG1705391.1| gb|KAG7388034.1|
## "BioSample: SAMN01816557" "BioSample: SAMN09692924" "BioSample: SAMN17922340"
## gb|KAG7391651.1| gb|KAH7470505.1| gb|OWZ13752.1|
## "BioSample: SAMN17922341" "BioSample: SAMN19730089" "BioSample: SAMN04632436"
## dbj|GMF32380.1| gb|ETI47070.1| gb|ETK86998.1|
## "BioSample: SAMD00557650" "BioSample: SAMN01816558" "BioSample: SAMN02178347"
## gb|ETL40412.1| gb|ETL93566.1| gb|KAE8887753.1|
## "BioSample: SAMN02178348" "BioSample: SAMN02178349" "BioSample: SAMN07449681"
## gb|KAE8946961.1| gb|KAE8988664.1| gb|KAE8991089.1|
## "BioSample: SAMN07449687" "BioSample: SAMN07449691" "BioSample: SAMN07449690"
## gb|KAF1781219.1| gb|KAF4045723.1| gb|KAG1686864.1|
## "BioSample: SAMN04633454" "BioSample: SAMN13441166" "BioSample: SAMN09692924"
## gb|KAG1692341.1| gb|KAG2776868.1| gb|KAG3023315.1|
## "BioSample: SAMN09692924" "BioSample: SAMN06766401" "BioSample: SAMN07267863"
## gb|KAG3114356.1| gb|KAG6604511.1| gb|KAG6972529.1|
## "BioSample: SAMN07267868" "BioSample: SAMN16706909" "BioSample: SAMN17151088"
## gb|KAG7391747.1| gb|KAG7400365.1| gb|KUF97494.1|
## "BioSample: SAMN17922341" "BioSample: SAMN17922340" "BioSample: SAMN04017960"
## gb|OWZ19253.1| gb|POM60565.1| ref|XP_002904741.1|
## "BioSample: SAMN04632436" "BioSample: SAMN04632378" "ref|XP_002904741.1|"
## ref|XP_008905518.1| ref|XP_009520697.1| gb|KAG2967876.1|
## "ref|XP_008905518.1|" "ref|XP_009520697.1|" "BioSample: SAMN07267863"
## gb|KAG3088486.1| gb|KAG6961386.1| gb|ETL86515.1|
## "BioSample: SAMN07267868" "BioSample: SAMN17151088" "BioSample: SAMN02178349"
## gb|ETP37612.1| ref|XP_008910456.1| gb|ETL86504.1|
## "BioSample: SAMN01816557" "ref|XP_008910456.1|" "BioSample: SAMN02178349"
## gb|KAF1785852.1| gb|KAG2759394.1| gb|ETO68411.1|
## "BioSample: SAMN04633454" "BioSample: SAMN06766401" "BioSample: SAMN01816559"
## gb|ETP37619.1| gb|KAG2980116.1| gb|KAG3045077.1|
## "BioSample: SAMN01816557" "BioSample: SAMN07267863" "BioSample: SAMN07267864"
# Retrieve the BioSample accessions for the sseqid column
phytophthora_pinky$biosample_accession_phytophthora <- sapply(phytophthora_pinky$sseqid, get_biosample_accession_phytophthora)
# Print the updated data
print(phytophthora_pinky)
## # A tibble: 57 × 9
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus biosa…³
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 DNA Ligase gb|KA… 2.97e- 24 29920 Phytop… Eukary… hypot… Phyt… BioSam…
## 2 DNA Ligase gb|KA… 3.4 e- 31 164328 Phytop… Eukary… DNA l… Phyt… BioSam…
## 3 DNA Ligase gb|KA… 4.01e- 24 164328 Phytop… Eukary… DNA l… Phyt… BioSam…
## 4 NODE_10695_len… gb|KA… 1.5 e- 11 109152 Phytop… Eukary… hypot… Phyt… BioSam…
## 5 NODE_2150_leng… gb|ET… 4.73e-116 4792;1… Phytop… Eukary… hypot… Phyt… BioSam…
## 6 NODE_2150_leng… gb|ET… 1.12e-111 1317066 Phytop… Eukary… hypot… Phyt… BioSam…
## 7 NODE_2150_leng… gb|ET… 2.82e-113 1317064 Phytop… Eukary… hypot… Phyt… BioSam…
## 8 NODE_2150_leng… gb|KU… 5.03e-115 4790 Phytop… Eukary… Major… Phyt… BioSam…
## 9 NODE_2150_leng… ref|X… 2.31e-114 761204 Phytop… Eukary… hypot… Phyt… ref|XP…
## 10 NODE_4293_leng… gb|KA… 1.38e- 27 221518 Phytop… Eukary… hypot… Phyt… BioSam…
## # … with 47 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## # ³​biosample_accession_phytophthora
# Removing repeated Biosample IDs.
phytophthora_pinky_biosample <- phytophthora_pinky %>%
group_by(qseqid) %>%
distinct(biosample_accession_phytophthora, .keep_all = TRUE)
phytophthora_pinky_biosample
## # A tibble: 56 × 9
## # Groups: qseqid [14]
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus biosa…³
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 DNA Ligase gb|KA… 2.97e- 24 29920 Phytop… Eukary… hypot… Phyt… BioSam…
## 2 DNA Ligase gb|KA… 3.4 e- 31 164328 Phytop… Eukary… DNA l… Phyt… BioSam…
## 3 DNA Ligase gb|KA… 4.01e- 24 164328 Phytop… Eukary… DNA l… Phyt… BioSam…
## 4 NODE_10695_len… gb|KA… 1.5 e- 11 109152 Phytop… Eukary… hypot… Phyt… BioSam…
## 5 NODE_2150_leng… gb|ET… 4.73e-116 4792;1… Phytop… Eukary… hypot… Phyt… BioSam…
## 6 NODE_2150_leng… gb|ET… 1.12e-111 1317066 Phytop… Eukary… hypot… Phyt… BioSam…
## 7 NODE_2150_leng… gb|ET… 2.82e-113 1317064 Phytop… Eukary… hypot… Phyt… BioSam…
## 8 NODE_2150_leng… gb|KU… 5.03e-115 4790 Phytop… Eukary… Major… Phyt… BioSam…
## 9 NODE_2150_leng… ref|X… 2.31e-114 761204 Phytop… Eukary… hypot… Phyt… ref|XP…
## 10 NODE_4293_leng… gb|KA… 1.38e- 27 221518 Phytop… Eukary… hypot… Phyt… BioSam…
## # … with 46 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## # ³​biosample_accession_phytophthora
# List of protein accession numbers
phytophthora_accessions <- c("gb|KAG6947535.1|", "gb|KAH7468958.1|", "gb|KAH7499174.1|", "gb|KAG7394170.1|", "gb|ETI39689.1|",
"gb|ETO68404.1|", "gb|ETP37615.1|", "gb|KUG01557.1|", "ref|XP_008910449.1|", "gb|KAG7375702.1|", "gb|KAF1783475.1|", "gb|ETL77478.1|", "gb|ETP35848.1|", "gb|KAG1705391.1|", "gb|KAG7388034.1|", "gb|KAG7391651.1|", "gb|KAH7470505.1|", "gb|OWZ13752.1|", "dbj|GMF32380.1|", "gb|ETI47070.1|", "gb|ETK86998.1|", "gb|ETL40412.1|", "gb|ETL93566.1|", "gb|KAE8887753.1|", "gb|KAE8946961.1|", "gb|KAE8988664.1|", "gb|KAE8991089.1|", "gb|KAF1781219.1|", "gb|KAF4045723.1|", "gb|KAG1686864.1|", "gb|KAG1692341.1|", "gb|KAG2776868.1|", "gb|KAG3023315.1|", "gb|KAG3114356.1|", "gb|KAG6604511.1|",
"gb|KAG6972529.1|", "gb|KAG7391747.1|", "gb|KAG7400365.1|", "gb|KUF97494.1|", "gb|OWZ19253.1|", "gb|POM60565.1|", "ref|XP_002904741.1|", "ref|XP_008905518.1|", "ref|XP_009520697.1|", "gb|KAG2967876.1|", "gb|KAG3088486.1|", "gb|KAG6961386.1|", "gb|ETL86515.1|", "gb|ETP37612.1|", "ref|XP_008910456.1|", "gb|ETL86504.1|", "gb|KAF1785852.1|", "gb|KAG2759394.1|", "gb|ETO68411.1|", "gb|ETP37619.1|", "gb|KAG2980116.1|", "gb|KAG3045077.1|")
# Retrieve protein sequences using NCBI Entrez
# Write protein sequences to a FASTA file
writeLines(phytophthora_accessions, "protein_sequences.faa")
library(rentrez)
# List of protein accession numbers
protein_accessions <- c("gb|KAG6947535.1|", "gb|KAH7468958.1|", "gb|KAH7499174.1|", "gb|KAG7394170.1|", "gb|ETI39689.1|",
"gb|ETO68404.1|", "gb|ETP37615.1|", "gb|KUG01557.1|", "ref|XP_008910449.1|", "gb|KAG7375702.1|", "gb|KAF1783475.1|", "gb|ETL77478.1|", "gb|ETP35848.1|", "gb|KAG1705391.1|", "gb|KAG7388034.1|", "gb|KAG7391651.1|", "gb|KAH7470505.1|", "gb|OWZ13752.1|", "dbj|GMF32380.1|", "gb|ETI47070.1|", "gb|ETK86998.1|", "gb|ETL40412.1|", "gb|ETL93566.1|", "gb|KAE8887753.1|", "gb|KAE8946961.1|", "gb|KAE8988664.1|", "gb|KAE8991089.1|", "gb|KAF1781219.1|", "gb|KAF4045723.1|", "gb|KAG1686864.1|", "gb|KAG1692341.1|", "gb|KAG2776868.1|", "gb|KAG3023315.1|", "gb|KAG3114356.1|", "gb|KAG6604511.1|",
"gb|KAG6972529.1|", "gb|KAG7391747.1|", "gb|KAG7400365.1|", "gb|KUF97494.1|", "gb|OWZ19253.1|", "gb|POM60565.1|", "ref|XP_002904741.1|", "ref|XP_008905518.1|", "ref|XP_009520697.1|", "gb|KAG2967876.1|", "gb|KAG3088486.1|", "gb|KAG6961386.1|", "gb|ETL86515.1|", "gb|ETP37612.1|", "ref|XP_008910456.1|", "gb|ETL86504.1|", "gb|KAF1785852.1|", "gb|KAG2759394.1|", "gb|ETO68411.1|", "gb|ETP37619.1|", "gb|KAG2980116.1|", "gb|KAG3045077.1|")
# Retrieve protein sequences using NCBI Entrez
protein_sequences <- entrez_fetch(
db = "protein",
id = protein_accessions,
rettype = "fasta_cds_aa", # Use "fasta_cds_aa" to get protein sequences
retmode = "text"
)
# Write protein sequences to a .faa file
output_file <- "protein_sequences.faa"
writeLines(protein_sequences, output_file)
# Print a message indicating successful writing
cat("Protein sequences have been written to", output_file, "\n")
## Protein sequences have been written to protein_sequences.faa
# Count the number of times each genus appears for a specific protein sequence
phytophthora_biosample_count <- phytophthora_pinky_biosample %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep")
print(phytophthora_biosample_count)
## # A tibble: 14 × 3
## # Groups: qseqid, genus [14]
## qseqid genus count
## <chr> <chr> <int>
## 1 DNA Ligase Phytophthora 3
## 2 NODE_10695_length_9982_cov_5.336658_8 Phytophthora 1
## 3 NODE_2150_length_28922_cov_4.964458_23 Phytophthora 5
## 4 NODE_4293_length_18587_cov_5.277736_18 Phytophthora 1
## 5 NODE_5244_length_16314_cov_5.483917_3 Phytophthora 1
## 6 NODE_795_length_49550_cov_5.527831_47 Phytophthora 2
## 7 NODE_936_length_45786_cov_4.756839_11 Phytophthora 5
## 8 NODE_936_length_45786_cov_4.756839_16 Phytophthora 25
## 9 NODE_936_length_45786_cov_4.756839_28 Phytophthora 2
## 10 NODE_936_length_45786_cov_4.756839_36 Phytophthora 1
## 11 NODE_936_length_45786_cov_4.756839_44 Phytophthora 3
## 12 RNAPL Phytophthora 3
## 13 RNAPS Phytophthora 2
## 14 VLTF3 Phytophthora 2
phytophthora_biosample_ggplot <- ggplot(phytophthora_biosample_count, aes(x = qseqid, y = count, fill = genus)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = pc) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
phytophthora_biosample_ggplot

# Count the number of times each genus appears for a specific protein sequence
phytophthora_biosample_repeats <- phytophthora_pinky_biosample %>%
group_by(biosample_accession_phytophthora) %>%
summarise(count = n(), .groups = "keep")
print(phytophthora_biosample_repeats)
## # A tibble: 34 × 2
## # Groups: biosample_accession_phytophthora [34]
## biosample_accession_phytophthora count
## <chr> <int>
## 1 BioSample: SAMD00557650 1
## 2 BioSample: SAMN01816557 4
## 3 BioSample: SAMN01816558 2
## 4 BioSample: SAMN01816559 2
## 5 BioSample: SAMN02178347 1
## 6 BioSample: SAMN02178348 1
## 7 BioSample: SAMN02178349 4
## 8 BioSample: SAMN04017959 1
## 9 BioSample: SAMN04017960 1
## 10 BioSample: SAMN04632378 1
## # … with 24 more rows
phytophthora_repeated <- ggplot(phytophthora_biosample_repeats, aes(x = biosample_accession_phytophthora, y = count)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = pc) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
phytophthora_repeated

Bacteria HGT
pinky_bacteria <- pinky_duplicates %>%
filter(sskingdoms == "Bacteria") %>%
group_by(qseqid, sseqid, staxids) %>%
slice_min(order_by = evalue) %>%
ungroup() %>%
mutate(genus = word(sscinames, 1))
pinky_bacteria
## # A tibble: 922 × 8
## qseqid sseqid evalue staxids sscinames sskin…¹ stitle genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 A32 gb|MAR56914.1| 9.91e- 16 2026788 Rickettsia… Bacter… hypot… Rick…
## 2 A32 gb|MCA9330243.1| 3.23e- 15 2026720 Candidatus… Bacter… hypot… Cand…
## 3 A32 gb|MCE7958853.1| 7.27e- 50 2293626 Acidobacte… Bacter… hypot… Acid…
## 4 A32 gb|MCK9607861.1| 1.86e- 30 418 Methylomon… Bacter… hypot… Meth…
## 5 A32 gb|MDE2097674.1| 2.09e- 48 2052139 Patescibac… Bacter… ATP-b… Pate…
## 6 A32 gb|NBP51235.1| 4.51e- 53 1883427 Actinomyce… Bacter… hypot… Acti…
## 7 A32 gb|NBP85256.1| 4.87e- 30 2448780 Mycobacter… Bacter… hypot… Myco…
## 8 A32 ref|WP_292571174.1| 1.35e- 30 418 Methylomon… Bacter… hypot… Meth…
## 9 D5 gb|MCK9604135.1| 2.11e-137 2035772 Candidatus… Bacter… PriCT… Cand…
## 10 D5 gb|MDE2099315.1| 6.3 e-150 2052139 Patescibac… Bacter… hypot… Pate…
## # … with 912 more rows, and abbreviated variable name ¹​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
pinky_bacteria_counts <- pinky_bacteria %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 2) # Only include those with more than one count
print(pinky_bacteria_counts)
## # A tibble: 75 × 3
## # Groups: qseqid, genus [75]
## qseqid genus count
## <chr> <chr> <int>
## 1 Deoxyuridine 5'-triphosphate nucleotidohydrolase Acidobacteriota 4
## 2 NODE_1151_length_41258_cov_5.297770_57 Alphaproteobacteria 5
## 3 NODE_1151_length_41258_cov_5.297770_57 Bacteroidota 8
## 4 NODE_1151_length_41258_cov_5.297770_57 Chitinophagaceae 5
## 5 NODE_1151_length_41258_cov_5.297770_57 Fluviispira 3
## 6 NODE_1151_length_41258_cov_5.297770_57 Mesorhizobium 3
## 7 NODE_1151_length_41258_cov_5.297770_57 Saprospiraceae 3
## 8 NODE_1151_length_41258_cov_5.297770_57 Silvanigrella 3
## 9 NODE_15262_length_7722_cov_5.326203_6 Deltaproteobacteria 4
## 10 NODE_15262_length_7722_cov_5.326203_6 Lujinxingia 8
## # … with 65 more rows
pinky_bacteria_ggplot <- ggplot(pinky_bacteria_counts, aes(x = genus, y = count)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = pc) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
pinky_bacteria_ggplot
